Inlcude all the libraries here:
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(adabag)
library(leaps)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(adabag)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(caret)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(tree)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(rpart)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.2
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
##
## Attaching package: 'glmnet'
## The following object is masked from 'package:pROC':
##
## auc
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
Let us first read and understand the data:
datahealth <- read.csv("survey.csv", header=T)
summary(datahealth)
## Timestamp Age Gender
## 2014-08-27 12:31:41: 2 Min. :-1.726e+03 Male :615
## 2014-08-27 12:37:50: 2 1st Qu.: 2.700e+01 male :206
## 2014-08-27 12:43:28: 2 Median : 3.100e+01 Female :121
## 2014-08-27 12:44:51: 2 Mean : 7.943e+07 M :116
## 2014-08-27 12:54:11: 2 3rd Qu.: 3.600e+01 female : 62
## 2014-08-27 14:22:43: 2 Max. : 1.000e+11 F : 38
## (Other) :1247 (Other):101
## Country state self_employed family_history treatment
## United States :751 CA :138 No :1095 No :767 No :622
## United Kingdom:185 WA : 70 Yes : 146 Yes:492 Yes:637
## Canada : 72 NY : 57 NA's: 18
## Germany : 45 TN : 45
## Ireland : 27 TX : 44
## Netherlands : 27 (Other):390
## (Other) :152 NA's :515
## work_interfere no_employees remote_work tech_company
## Never :213 1-5 :162 No :883 No : 228
## Often :144 100-500 :176 Yes:376 Yes:1031
## Rarely :173 26-100 :289
## Sometimes:465 500-1000 : 60
## NA's :264 6-25 :290
## More than 1000:282
##
## benefits care_options wellness_program seek_help
## Don't know:408 No :501 Don't know:188 Don't know:363
## No :374 Not sure:314 No :842 No :646
## Yes :477 Yes :444 Yes :229 Yes :250
##
##
##
##
## anonymity leave mental_health_consequence
## Don't know:819 Don't know :563 Maybe:477
## No : 65 Somewhat difficult:126 No :490
## Yes :375 Somewhat easy :266 Yes :292
## Very difficult : 98
## Very easy :206
##
##
## phys_health_consequence coworkers supervisor
## Maybe:273 No :260 No :393
## No :925 Some of them:774 Some of them:350
## Yes : 61 Yes :225 Yes :516
##
##
##
##
## mental_health_interview phys_health_interview mental_vs_physical
## Maybe: 207 Maybe:557 Don't know:576
## No :1008 No :500 No :340
## Yes : 44 Yes :202 Yes :343
##
##
##
##
## obs_consequence
## No :1075
## Yes: 184
##
##
##
##
##
## comments
## * Small family business - YMMV. : 5
## : 1
## - : 1
## (yes but the situation was unusual and involved a change in leadership at a very high level in the organization as well as an extended leave of absence) : 1
## A close family member of mine struggles with mental health so I try not to stigmatize it. My employers/coworkers also seem compassionate toward any kind of health or family needs.: 1
## (Other) : 155
## NA's :1095
data1 <- datahealth
dim(datahealth)
## [1] 1259 27
names(data1)
## [1] "Timestamp" "Age"
## [3] "Gender" "Country"
## [5] "state" "self_employed"
## [7] "family_history" "treatment"
## [9] "work_interfere" "no_employees"
## [11] "remote_work" "tech_company"
## [13] "benefits" "care_options"
## [15] "wellness_program" "seek_help"
## [17] "anonymity" "leave"
## [19] "mental_health_consequence" "phys_health_consequence"
## [21] "coworkers" "supervisor"
## [23] "mental_health_interview" "phys_health_interview"
## [25] "mental_vs_physical" "obs_consequence"
## [27] "comments"
#CHANGE!!!!
GenMale <- c("cis male", "Cis Male", "Cis Man", "m", "M", "Mail", "maile", "Make", "Mal", "male", "Male", "Male ", "Male (CIS)", "Malr", "Man", "msle")
GenFemale <- c("cis-female/femme", "Cis Female", "f", "F", "femail", "Femake", "female", "Female", "Female ", "Female (cis)", "Female (trans)", "Trans-female", "Trans woman", "woman", "Woman")
# Assigning the entries according to "categories"
data1$newgender <-
ifelse((data1$Gender %in% GenMale), "Male", # Assigning "Male" to those who entered a string contained in GenMale
ifelse((data1$Gender %in% GenFemale), "Female", "Non-M/F")) %>% # Assigning "Female" to those who entered a string contained in GenFemale
as.factor()
# Observing cleaned table
table(data1$newgender)
##
## Female Male Non-M/F
## 251 990 18
#Clean the age column to eliminate spurious values like negatives and ages above 120
data1 = data1[(data1$Age > 15) & (data1$Age < 120),]
dim(data1)
## [1] 1251 28
data1 = subset(data1, select=-c(Gender, Timestamp, comments))
data1 <- data1 %>% rename(Gender = newgender )
names(data1)
## [1] "Age" "Country"
## [3] "state" "self_employed"
## [5] "family_history" "treatment"
## [7] "work_interfere" "no_employees"
## [9] "remote_work" "tech_company"
## [11] "benefits" "care_options"
## [13] "wellness_program" "seek_help"
## [15] "anonymity" "leave"
## [17] "mental_health_consequence" "phys_health_consequence"
## [19] "coworkers" "supervisor"
## [21] "mental_health_interview" "phys_health_interview"
## [23] "mental_vs_physical" "obs_consequence"
## [25] "Gender"
#na.omit(data1)
dim(data1)
## [1] 1251 25
sapply(data1, class)
## Age Country
## "numeric" "factor"
## state self_employed
## "factor" "factor"
## family_history treatment
## "factor" "factor"
## work_interfere no_employees
## "factor" "factor"
## remote_work tech_company
## "factor" "factor"
## benefits care_options
## "factor" "factor"
## wellness_program seek_help
## "factor" "factor"
## anonymity leave
## "factor" "factor"
## mental_health_consequence phys_health_consequence
## "factor" "factor"
## coworkers supervisor
## "factor" "factor"
## mental_health_interview phys_health_interview
## "factor" "factor"
## mental_vs_physical obs_consequence
## "factor" "factor"
## Gender
## "factor"
names(data1)
## [1] "Age" "Country"
## [3] "state" "self_employed"
## [5] "family_history" "treatment"
## [7] "work_interfere" "no_employees"
## [9] "remote_work" "tech_company"
## [11] "benefits" "care_options"
## [13] "wellness_program" "seek_help"
## [15] "anonymity" "leave"
## [17] "mental_health_consequence" "phys_health_consequence"
## [19] "coworkers" "supervisor"
## [21] "mental_health_interview" "phys_health_interview"
## [23] "mental_vs_physical" "obs_consequence"
## [25] "Gender"
data1$work_interfere <- as.character(data1$work_interfere)
data1$work_interfere[is.na(data1$work_interfere)] <- "Never"
data1$work_interfere <- as.factor(data1$work_interfere)
summary(data1$work_interfere)
## Never Often Rarely Sometimes
## 474 140 173 464
Let us see the distribution of data with respect to tech and non-tech companies:
bar <- ggplot(data=data1, aes(x = sum(tech_company =="Yes"), fill = tech_company)) + geom_bar(width = 0.2) +coord_fixed(ratio = 0.2)
pie <- bar + coord_polar("y", start=0) +theme_void()
pie
Clearly, our data is skewed in favor of the tech companies.
Let us se the distribution of data with respect to the number individuals seeking treatment for mental illnesses:
bar <- ggplot(data=data1, aes(x = sum(treatment =="Yes"), fill = treatment)) + geom_bar(width = 0.2) +coord_fixed(ratio = 0.2)
pie <- bar + coord_polar("y", start=0) + theme_void() + scale_fill_manual(values=c("#999999", "#E69F00"))
pie
We have close to even distribution of data with respect to individuals seeking treatment.
What is the percentage of folks with a Family history of mental illnesses?
colors1 <- c("No" = "#fffff", "Yes" = "qqqqq", "Maybe" = "#11111", "Not sure" = "#11111", "Don't know" = "#11111")
data1 %>%
count(family_history) %>%
plot_ly(
labels = ~family_history,
values = ~n,
type = "pie",
textposition = 'inside',
textinfo = 'label+percent',
hoverinfo = 'text', # Setting text on hover (see text variable on next line)
text = ~paste(n, "Respondents"), # Setting text on hover
marker = list(colors = colors1)) %>% # Setting up colors for clarity
layout(title = "Responses")
Do the ones with a family history of mental illness seek treatment?
ggplot(data=data1, aes(x=family_history, fill = treatment)) +geom_bar() +theme_light() +scale_fill_brewer(palette="Set2")
If the worker is willing to discuss the mental health issue with the supervisor, is he or she more probable to seek treatment?
ggplot(data=data1, aes(x=supervisor, fill = treatment)) +geom_bar() +theme_light() +scale_fill_brewer(palette="Set3")
Does the anonymity of the worker affect the individual seeking treatment?
ggplot(data=data1, aes(x=anonymity, fill = treatment)) +geom_bar() +theme_light() +scale_fill_brewer(palette="Set2")
Do the consequences of seeking mental help affect the worker seeking treatment?
ggplot(data=data1, aes(x=obs_consequence, fill = treatment)) +geom_bar() +theme_light() +scale_fill_brewer(palette="Set1")
How seriously are issues related to mental health taken in comparison to physical health, in tech and non-tech companies:
ggplot(data=data1, aes(x=mental_vs_physical, fill = tech_company)) +geom_bar() +theme_light() +scale_fill_brewer(palette="Set2")
Some functions that can be resued later.
# #Define some functions that can be resued later.
#
# getNumericColumns<-function(t){
# tn = sapply(t,function(x){is.numeric(x)})
# return(names(tn)[which(tn)])
# }
#
# getFactorColumns<-function(t){
# tn = sapply(t,function(x){is.factor(x)})
# return(names(tn)[which(tn)])
# }
Out of the 1251 samples, we are reserving 1000 samples for training and 251 samples for testing.
set.seed(1)
n <- nrow(data1)
train.index <- sample(n,1000)
health.train <- data1[train.index,]
health.test <- data1[-train.index,]
x.train <- health.train[,-6]
y.train <- health.train$treatment
x.test <- health.test[,-6]
y.test <- health.test$treatment
#Creating a dataframe to save results of each method in order to plot a graph
success <- data.frame(methods=c("Logistic Regression","Single Tree", "Random Forest","Bagging","Neural Nets"), percentages=c(0,0,0,0,0))
Logistic regression:
fit0 <- glm(treatment~ ., data = health.train, family=binomial(logit))
Anova(fit0) #Perform Anova to get significant variables
## Analysis of Deviance Table (Type II tests)
##
## Response: treatment
## LR Chisq Df Pr(>Chisq)
## Age 0.003 1 0.9568538
## Country 0.560 3 0.9056227
## state 49.069 42 0.2107834
## self_employed 2.427 1 0.1192484
## family_history 13.176 1 0.0002835 ***
## work_interfere 221.559 3 < 2.2e-16 ***
## no_employees 10.483 5 0.0626501 .
## remote_work 0.050 1 0.8235812
## tech_company 2.467 1 0.1162323
## benefits 9.899 2 0.0070874 **
## care_options 10.121 2 0.0063423 **
## wellness_program 0.841 2 0.6567468
## seek_help 7.612 2 0.0222392 *
## anonymity 10.474 2 0.0053164 **
## leave 1.530 4 0.8213834
## mental_health_consequence 4.036 2 0.1328910
## phys_health_consequence 1.303 2 0.5213697
## coworkers 4.325 2 0.1150401
## supervisor 2.316 2 0.3140699
## mental_health_interview 2.352 2 0.3084630
## phys_health_interview 0.554 2 0.7578853
## mental_vs_physical 2.286 2 0.3188246
## obs_consequence 0.034 1 0.8540161
## Gender 4.035 2 0.1330195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since state and self_employed have NA values but are not significant at the 0.05 level, we can remove these columns from our data.
data1 <- data1[, -c(3,4)]
health.train <- health.train[, -c(3,4)]
health.test <- health.test[, -c(3,4)]
x.train <- x.train[, -c(3,4)]
x.test <- x.test[, -c(3,4)]
Picking out only the significant variables, we get a better model with the variables - family_history, work_interfere, benefits, care_options, seek_help, anonymity.
fit1 <- glm(treatment ~ family_history + work_interfere + benefits + care_options + seek_help + anonymity, data = health.train, family=binomial(logit))
Anova(fit1) #Anonymity is not significant. Remove it.
## Analysis of Deviance Table (Type II tests)
##
## Response: treatment
## LR Chisq Df Pr(>Chisq)
## family_history 31.66 1 1.834e-08 ***
## work_interfere 385.89 3 < 2.2e-16 ***
## benefits 12.86 2 0.001609 **
## care_options 10.51 2 0.005212 **
## seek_help 7.02 2 0.029952 *
## anonymity 4.81 2 0.090433 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit2 <- glm(treatment ~ family_history + work_interfere + benefits + care_options + seek_help , data = health.train, family=binomial(logit))
Anova(fit2) #seek_help is not significant. Remove it.
## Analysis of Deviance Table (Type II tests)
##
## Response: treatment
## LR Chisq Df Pr(>Chisq)
## family_history 32.48 1 1.202e-08 ***
## work_interfere 383.88 3 < 2.2e-16 ***
## benefits 14.83 2 0.0006009 ***
## care_options 14.38 2 0.0007522 ***
## seek_help 5.85 2 0.0535892 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit3 <- glm(treatment ~ family_history + work_interfere + benefits + care_options , data = health.train, family=binomial(logit))
Anova(fit3) #All variables significant at 0.05 level
## Analysis of Deviance Table (Type II tests)
##
## Response: treatment
## LR Chisq Df Pr(>Chisq)
## family_history 32.90 1 9.708e-09 ***
## work_interfere 378.80 3 < 2.2e-16 ***
## benefits 19.24 2 6.644e-05 ***
## care_options 13.02 2 0.001488 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit1.roc <- roc(health.train$treatment, fit1$fitted, plot=F)
fit2.roc <- roc(health.train$treatment, fit2$fitted, plot=F)
fit3.roc <- roc(health.train$treatment, fit3$fitted, plot=F)
#Not much difference between the 3 fits.
plot(1-fit1.roc$specificities, fit1.roc$sensitivities, col="red", pch=16, cex=.7,
xlab="False Positive",
ylab="Sensitivity")
points(1-fit2.roc$specificities, fit2.roc$sensitivities, col="blue", pch=16, cex=.6)
points(1-fit3.roc$specificities, fit3.roc$sensitivities, col="green", pch=16, cex=.6)
title("Red is for fit1, blue is for fit2, and green is for fit3")
# roccurve <- roc(health.test$treatment ~ predict(fit3, health.test))
# plot(roccurve)
fit.pred <- rep("No", 1000)
fit.pred[fit3$fitted > 2/3]="Yes"
MCE = (sum((fit.pred[health.train$treatment == "Yes"] != "Yes"))
+ sum((fit.pred[health.train$treatment == "No"] != "No")))/length(health.train$treatment)
MCE #0.191
## [1] 0.191
success$percentages[success$methods == "Logistic Regression"] <- (100 - MCE*100)
Single tree:
set.seed(1)
fit.single <- randomForest(treatment~., health.train, mtry=2, ntree=1)
names(fit.single)
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
fit.single$mtry
## [1] 2
fit.single$votes[1:20, ] # prob of 0 and 1 using oob's
## No Yes
## 334 0 1
## 469 NaN NaN
## 720 0 1
## 1142 1 0
## 253 NaN NaN
## 1127 0 1
## 1185 1 0
## 828 1 0
## 787 NaN NaN
## 77 NaN NaN
## 257 0 1
## 220 NaN NaN
## 857 0 1
## 479 NaN NaN
## 958 1 0
## 619 0 1
## 892 NaN NaN
## 1233 NaN NaN
## 472 0 1
## 963 0 1
fit.single$predicted[1:20] # lables using oob's and majority vote. Notice those with NA because they are not in any OOB's
## 334 469 720 1142 253 1127 1185 828 787 77 257 220 857 479 958
## Yes <NA> Yes No <NA> Yes No No <NA> <NA> Yes <NA> Yes <NA> No
## 619 892 1233 472 963
## Yes <NA> <NA> Yes Yes
## Levels: No Yes
fit.single$err.rate[1,]["OOB"] # mis-classification errors of oob's/0/1
## OOB
## 0.3905817
predict(fit.single, health.test)[1:20] # prediction by using the RF based on all the training data.
## 6 11 12 20 28 33 36 39 41 46 50 55 59 68 73 75 76 80
## Yes Yes No No No No Yes Yes Yes No Yes No No Yes Yes No No No
## 84 88
## Yes No
## Levels: No Yes
data.frame(fit.single$votes[1:20, ], fit.single$predicted[1:20], predict(fit.single, health.test)[1:20] )
## No Yes fit.single.predicted.1.20.
## 334 0 1 Yes
## 469 NaN NaN <NA>
## 720 0 1 Yes
## 1142 1 0 No
## 253 NaN NaN <NA>
## 1127 0 1 Yes
## 1185 1 0 No
## 828 1 0 No
## 787 NaN NaN <NA>
## 77 NaN NaN <NA>
## 257 0 1 Yes
## 220 NaN NaN <NA>
## 857 0 1 Yes
## 479 NaN NaN <NA>
## 958 1 0 No
## 619 0 1 Yes
## 892 NaN NaN <NA>
## 1233 NaN NaN <NA>
## 472 0 1 Yes
## 963 0 1 Yes
## predict.fit.single..health.test..1.20.
## 334 Yes
## 469 Yes
## 720 No
## 1142 No
## 253 No
## 1127 No
## 1185 Yes
## 828 Yes
## 787 Yes
## 77 No
## 257 Yes
## 220 No
## 857 No
## 479 Yes
## 958 Yes
## 619 No
## 892 No
## 1233 No
## 472 Yes
## 963 No
success$percentages[success$methods == "Single Tree"] <- (100 - 100*fit.single$err.rate[1,]["OOB"])
Random forests:
health.rf <- train(treatment~., data=health.train, method="rf",metric="Accuracy", ntree=20)
plot(health.rf)
predict.rf <- predict(health.rf,health.test)
#Accuracy
confusionMatrix(predict.rf, health.test$treatment)$overall[1]
## Accuracy
## 0.7968127
success$percentages[success$methods == "Random Forest"] <- confusionMatrix(predict.rf, health.test$treatment)$overall[1]*100
Neural nets:
# Let us first calculate the number of hidden layers/nodes and the decay parameters
# size: number of intermediate hidden nodes
# decay: parameter to avoid overfitting
parameter <- train( treatment ~ . , data=health.train, method="nnet", trace=F)
size <- parameter$bestTune$size
decay <- parameter$bestTune$decay
# Neural net model:
model.nn <- nnet(treatment ~ ., size=size, decay=decay, trace=F, data=health.train)
predict.nn <- predict(model.nn, health.test, type = "class")
sum(predict.nn==y.test)/length(predict.nn) #Accuracy
## [1] 0.812749
success$percentages[success$methods == "Neural Nets"] <- confusionMatrix(predict.nn,health.test$treatment)$overall[1]*100
Bagging:
bag.model <- bagging(treatment ~ ., data=health.train)
predict.bag <- predict(bag.model, health.test, type="class")
confusionMatrix(predict.bag$class, health.test$treatment)$overall[1]
## Accuracy
## 0.8366534
success$percentages[success$methods == "Bagging"] <- confusionMatrix(predict.bag$class, health.test$treatment)$overall[1]*100
Lets plot our success rates for different methods:
success
## methods percentages
## 1 Logistic Regression 80.90000
## 2 Single Tree 60.94183
## 3 Random Forest 79.68127
## 4 Bagging 83.66534
## 5 Neural Nets 81.27490
ggplot(success, aes(x=methods, y=percentages)) + geom_bar(stat="identity", fill=c("yellowgreen", "hotpink2", "dodgerblue3", "orange2","Red"), width = 0.2) + coord_flip() + theme(legend.position = "none") + geom_text(aes(label = format(round(percentages, 2), nsmall = 2)), size = 3, hjust = 3, vjust = 3)